TD DSA 2021 de Antoine Ly - rapport de Fabien Faivre


Setup

!pip install textblob
Requirement already satisfied: textblob in /usr/local/lib/python3.8/dist-packages (0.15.3)
Requirement already satisfied: nltk>=3.1 in /usr/local/lib/python3.8/dist-packages (from textblob) (3.6.2)
Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from nltk>=3.1->textblob) (4.56.2)
Requirement already satisfied: regex in /usr/local/lib/python3.8/dist-packages (from nltk>=3.1->textblob) (2020.11.13)
Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from nltk>=3.1->textblob) (1.0.1)
Requirement already satisfied: click in /usr/local/lib/python3.8/dist-packages (from nltk>=3.1->textblob) (7.1.2)
WARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv
!pip install emot
Requirement already satisfied: emot in /usr/local/lib/python3.8/dist-packages (2.1)
WARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv
!pip install wordcloud
Requirement already satisfied: wordcloud in /usr/local/lib/python3.8/dist-packages (1.8.1)
Requirement already satisfied: pillow in /usr/local/lib/python3.8/dist-packages (from wordcloud) (8.1.0)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.8/dist-packages (from wordcloud) (3.3.4)
Requirement already satisfied: numpy>=1.6.1 in /usr/local/lib/python3.8/dist-packages (from wordcloud) (1.20.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->wordcloud) (1.3.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib->wordcloud) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->wordcloud) (2.8.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /usr/local/lib/python3.8/dist-packages (from matplotlib->wordcloud) (2.4.7)
Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from cycler>=0.10->matplotlib->wordcloud) (1.15.0)
WARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv
#Temps et fichiers
import os
import warnings
import time
from datetime import timedelta

#Manipulation de données
import pandas as pd
import numpy as np


# Text
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams

from textblob import TextBlob
import string
import re
import spacy 
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

#Modélisation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier


#Evaluation
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_score, recall_score


#Visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud

#Tracking d'expérience
import mlflow
import mlflow.sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
#Cellule strictement technique qui permet de sauver les exigences pour recréer au besoin l'image docker du projet
!pip freeze > /mnt/docker/requirements.txt

Utilisation du package

Durant ce projet, certaines parties du code ont été repackagées dans un package propre au projet afin de faciliter la lecture du code et de permettre la réutilisabilité des développements

#Cette cellule permet d'appeler la version packagée du projet et d'en assurer le reload avant appel des fonctions
%load_ext autoreload
%autoreload 2
from dsa_sentiment.scripts.make_dataset import load_data
from dsa_sentiment.scripts.evaluate import eval_metrics
from dsa_sentiment.scripts.make_dataset import Preprocess_StrLower, Preprocess_transform_target

Configuration de l’experiment MLFlow

MLFlow sera utilisé comme outil de suivi et de stockage des expérimentations réalisées

mlflow.tracking.get_tracking_uri()
'/mnt/experiments'

Chargement des données

!pwd
/mnt/rapport/notebooks
data_folder = os.path.join('/mnt', 'data', 'raw')
all_raw_files = [os.path.join(data_folder, fname)
                    for fname in os.listdir(data_folder)]
all_raw_files
['/mnt/data/raw/sample_submission.csv',
 '/mnt/data/raw/test.csv',
 '/mnt/data/raw/train.csv']
random_state=42

Il n’est pas possible de faire de l’imputation comme avec des champs numérique. Il convient donc de supprimer les tweets vides (dropNA=True).

On laisse 20% de données de côté dans un jeu de validation. Afin de simuler des conditions réelles d’exploitation, le classement des modèles se fera sur le jeu de validation uniquement, sans toucher au jeu de test.

A l’issue du premier classement les modèles seront réentrainés sur train + validation avant d’être évalués sur le jeu de test

X_train, y_train, X_val, y_val = load_data(all_raw_files[2], split=True, test_size=0.2, random_state=random_state, dropNA=True)
X_train.head()
textID text selected_text
0 cb774db0d1 I`d have responded, if I were going I`d have responded, if I were going
1 549e992a42 Sooo SAD I will miss you here in San Diego!!! Sooo SAD
2 088c60f138 my boss is bullying me... bullying me
3 9642c003ef what interview! leave me alone leave me alone
4 358bd9e861 Sons of ****, why couldn`t they put them on t... Sons of ****,
print(f'le jeu d\'entraînement initial contient', X_train.shape[0] + X_val.shape[0] , 'lignes')
print(f'le jeu d\'entraînement retenu contient', X_train.shape[0] , 'lignes')
print(f'le jeu de validation retenu contient', X_val.shape[0] , 'lignes')
le jeu d'entraînement initial contient 27480 lignes
le jeu d'entraînement retenu contient 21984 lignes
le jeu de validation retenu contient 5496 lignes
y_train.head()
sentiment
0 neutral
1 negative
2 negative
3 negative
4 negative
X_test, y_test = load_data(all_raw_files[1], split=False, random_state=random_state, dropNA=True)
X_test.head()
textID text
0 f87dea47db Last session of the day http://twitpic.com/67ezh
1 96d74cb729 Shanghai is also really exciting (precisely -...
2 eee518ae67 Recession hit Veronique Branquinho, she has to...
3 01082688c6 happy bday!
4 33987a8ee5 http://twitpic.com/4w75p - I like it!!
print(f'le jeu de test contient', X_test.shape[0] , 'lignes')
le jeu de test contient 3534 lignes

Transformation initiales des données

Cette partie vise uniquement à sélectionner les colonnes dont nous nous servirons et à transcoder la cible au format souhaité.

# Dans ce projet on ne se servira que du champs `text`. On cherche toutefois à conserver le format pandas DataFrame
X_train = X_train[['text']]
X_val = X_val[['text']]
X_test = X_test[['text']]
X_train.head()
text
0 I`d have responded, if I were going
1 Sooo SAD I will miss you here in San Diego!!!
2 my boss is bullying me...
3 what interview! leave me alone
4 Sons of ****, why couldn`t they put them on t...

Préalable : transformation des sorties

On commence par transformer les cibles pour se conformer aux instructions

y_train = Preprocess_transform_target(y_train, columns_to_process=['sentiment'])
y_train.head()
sentiment
0 0
1 -1
2 -1
3 -1
4 -1
y_val = Preprocess_transform_target(y_val, ['sentiment'])
y_val.head()
sentiment
21984 -1
21985 -1
21986 0
21987 0
21988 0
y_test = Preprocess_transform_target(y_test, ['sentiment'])
y_test.head()
sentiment
0 0
1 1
2 -1
3 1
4 1

On exporte les données sous parquet pour avoir une source de vérité unique dans les notebooks

# Données explicatives
X_train.to_parquet('/mnt/data/interim/X_train.gzip',compression='gzip')
X_val.to_parquet('/mnt/data/interim/X_val.gzip',compression='gzip')
X_test.to_parquet('/mnt/data/interim/X_test.gzip',compression='gzip')

# Données à expliquer
y_train.to_parquet('/mnt/data/interim/y_train.gzip',compression='gzip')
y_val.to_parquet('/mnt/data/interim/y_val.gzip',compression='gzip')
y_test.to_parquet('/mnt/data/interim/y_test.gzip',compression='gzip')

EDA

On commence par analyser l’équilibre des différentes classes de sentiments

df = pd.concat([X_train, y_train], axis=1)
df.head()
text polarity subjectivity neg neu pos compound sentiment
0 I`d have responded, if I were going 0.000000 0.0 0.000 1.000 0.0 0.0000 0
1 Sooo SAD I will miss you here in San Diego!!! -0.976562 1.0 0.474 0.526 0.0 -0.7437 -1
2 my boss is bullying me... 0.000000 0.0 0.494 0.506 0.0 -0.5994 -1
3 what interview! leave me alone 0.000000 0.0 0.538 0.462 0.0 -0.3595 -1
4 Sons of ****, why couldn`t they put them on t... 0.000000 0.0 0.000 1.000 0.0 0.0000 -1

Analyse de l’équilibre du jeu d’entrainement

fig = px.histogram(df, x="sentiment", color="sentiment", title = 'Nombre de tweets par sentiment')
fig.show()

Il existe un léger déséquilibre dans les classes en faveur des sentiments neutral

Pour la suite des travaux, on crée un corpus contenant la concaténation de tous les tweets d’une certaine tonalité.

def create_corpus(text_series):
    '''
    Flatten a collection of documents into a single list of tokens.

    inputs:
        text_series : a pandas Series (or iterable) of strings

    returns:
        a list of all whitespace-separated tokens, in document order
    '''
    # FIX: the original used `sum(list_of_lists, [])`, which copies the
    # accumulator on every step (O(n^2)); a flat comprehension is linear.
    return [token for doc in text_series for token in doc.split()]
    
positive_text = create_corpus(df['text'][df['sentiment']=='positive'])
negative_text = create_corpus(df['text'][df['sentiment']=='negative'])
neutral_text = create_corpus(df['text'][df['sentiment']=='neutral'])

Il devient alors possible de créer des histogrammes représentant la fréquence de N-grams dans un corpus donné

def plot_freq_dist(text_corpus, nb=30, ngram=1, title=''):
    '''
    Plot the most common n-grams of a corpus as a bar chart.

    inputs:
        text_corpus : a corpus of words
        nb : number of n-grams to plot
        ngram : size of the n-grams (1 = single words)
        title : graph title

    returns:
        nothing, displays the plotly figure
    '''

    freq_pos = Counter(ngrams(create_corpus(pd.Series(text_corpus)), ngram))
    pos_df = pd.DataFrame({
        "words": [' '.join(items) for items in list(freq_pos.keys())],
        "Count": list(freq_pos.values())
    })
    # BUGFIX: `nb` was previously ignored (n was hard-coded to 30).
    common_pos = pos_df.nlargest(columns="Count", n=nb)

    fig = px.bar(common_pos, x="words", y="Count", labels={"words": "Words", "Count": "Frequency"}, title=title)
    fig.show()
plot_freq_dist(positive_text, title = 'Most common words associated with positive tweets')

Le résultat montre la prépondérance des stopwords, ces mots d’articulation très communs, qui gênent l’identification de mots clefs propres à un document / ensemble de documents spécifiques.

Il convient donc d’effectuer des opérations de retraitement du texte pour analyse.

Preprocessing

Parmi les éléments propres aux tweets qui peuvent avoir un impact sur la suite on compte :

  • les mots clefs marqués par un #

  • les noms d’utilisateurs commençant par un @

  • les emoticons et emojis

  • les nombre de mots en MAJUSCULES

  • la répétition de caractères pour marquer l’emphase !!!!, looooong, ou l’autocensure f***

  • les fautes de frappes (mots de moins de 2 caractères)

Afin de disposer de traitements homogènes, reproductibles et paramétrables, une fonction spécifique est créée. Les différents paramètres pourront être testés dans les phases de modélisation ultérieures.

source preprocess

def preprocess_text(text_series, 
                    apply_lemmatizer=True,
                    apply_lowercase=True,
                    apply_url_standerdisation=True,
                    apply_user_standerdisation=True,
                    apply_emoticon_to_words=True,
                    apply_stopwords_removal=True,
                    apply_shortwords_removal=True,
                    apply_non_alphabetical_removal=True,
                    apply_only_2_consecutive_charac=True
                   
                   ):
    '''
    Main preprocess function for tweets.

    inputs:
        text_series : a pandas Series (or any iterable) of raw tweet strings
        apply_* : boolean flags enabling/disabling each cleaning step

    outputs:
        a list of preprocessed strings, one per input tweet
    '''

    processedText = []

    if apply_lemmatizer:
        # Created once and reused for every word of every tweet.
        wordLemm = WordNetLemmatizer()

    if apply_stopwords_removal:
        # Hoisted out of the per-word loop: stopwords.words() re-reads the
        # corpus on every call, and set membership is O(1) vs O(n) on a list.
        english_stopwords = set(stopwords.words('english'))

    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = r'@[^\s]+'
    # Keeps word chars, '*', '!', '#' (plus the literal class characters);
    # everything else becomes a space.
    alphaPattern      = r"[^(\w|\*|(!){2}|#)]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"

    for tweet in text_series:

        if apply_lowercase:
            tweet = tweet.lower()

        if apply_url_standerdisation:
            # Replace all URLs with ' URL'
            tweet = re.sub(urlPattern, ' URL', tweet)

        if apply_user_standerdisation:
            # Replace @USERNAME with ' USER'
            tweet = re.sub(userPattern, ' USER', tweet)

        if apply_emoticon_to_words:
            # Replace ASCII emoticons by a single EMO_* token so they
            # survive later whitespace tokenisation.
            for emo in EMOTICONS:
                val = 'EMO_' + "_".join(EMOTICONS[emo].replace(",", "").split())
                tweet = tweet.replace(emo, ' ' + val + ' ')

            # Same for unicode emojis.
            for emot in UNICODE_EMO:
                val = 'EMO_' + "_".join(
                    UNICODE_EMO[emot].replace(",", "").replace(":", "").split())
                # BUGFIX: the original called tweet.replace(emo, ...) with the
                # stale `emo` variable left over from the loop above, so
                # unicode emojis were never actually converted.
                tweet = tweet.replace(emot, ' ' + val + ' ')

        if apply_only_2_consecutive_charac:
            # Replace 3 or more consecutive identical characters by 2
            # (e.g. 'loooong' -> 'loong', '!!!' -> '!!').
            tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        if apply_non_alphabetical_removal:
            # Replace all remaining non-kept characters by spaces.
            tweet = re.sub(alphaPattern, " ", tweet)

        tweetwords = ''
        for word in tweet.split():
            # Drop stopwords and 1-character words by blanking them;
            # a removed word still contributes a space, exactly as before.
            if apply_stopwords_removal and word in english_stopwords:
                word = ''
            if apply_shortwords_removal and len(word) <= 1:
                word = ''
            if apply_lemmatizer:
                word = wordLemm.lemmatize(word)

            tweetwords += (word + ' ')

        processedText.append(tweetwords)

    return processedText
positive_text_2 = preprocess_text(df['text'][df['sentiment']=='positive'], apply_lemmatizer=False, apply_non_alphabetical_removal=True)
neutral_text_2 = preprocess_text(df['text'][df['sentiment']=='neutral'], apply_lemmatizer=False, apply_non_alphabetical_removal=True)
negative_text_2 = preprocess_text(df['text'][df['sentiment']=='negative'], apply_lemmatizer=False, apply_non_alphabetical_removal=True)

Analyses des mots clefs des tweets positifs

La fonction suivant permettra de réaliser des nuages de mots à partir d’un corpus

def plotWc(text, stopwords=None, title=''):
    """Build a word cloud from *text* and display it with matplotlib.

    inputs:
        text : a single string containing the whole corpus
        stopwords : optional collection of words to exclude from the cloud
        title : title shown above the figure
    """
    # Fixed random_state keeps the layout reproducible between runs.
    wordcloud = WordCloud(
        width=800,
        height=400,
        max_words=1000,
        random_state=44,
        background_color="white",
        collocations=False,
        stopwords=stopwords,
    ).generate(text)

    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()
plotWc(" ".join(positive_text_2), stopwords=stopwords.words('english'), title = "Wordcloud des tweets positifs")
../_images/Projet_FF-Copy1_62_0.png

Les tweets positifs sont a priori marqués par la forte représentation de mots à connotation positive love, good, happy.

Cet a priori graphique peut être confirmé par un graphique de fréquence des mots individuels les plus présents

plot_freq_dist(create_corpus(positive_text_2), title = 'Most common words associated with positive tweets')
plot_freq_dist(create_corpus(positive_text_2), ngram=2, title = 'Most common 2grams associated with positive tweets')
plot_freq_dist(create_corpus(positive_text_2), ngram=3, title = 'Most common 3grams associated with positive tweets')
plot_freq_dist(create_corpus(positive_text_2), ngram=4, title = 'Most common 4grams associated with positive tweets')

[insight] : Une grande majorité de tweets positifs se rapportent soit à la fête des mère, soit au 4 Mai du fait du jeu de mot avec Star Wars…

Cette spécificité sera surement exploitée par les modèles comme un marqueur probable de tweets positifs.

Analyse des mots clefs des tweets neutres

plotWc(" ".join(neutral_text_2), stopwords=stopwords.words('english'), title = "Wordcloud des tweets neutres")
../_images/Projet_FF-Copy1_70_0.png
plot_freq_dist(create_corpus(neutral_text_2), title = 'Most common words associated with neutral tweets')

[Insight] On peut déjà remarquer que le mot day, qui est le plus fréquent des mots clefs des tweets positifs apparaît aussi en 6ème position des mots neutres.

plot_freq_dist(create_corpus(neutral_text_2), ngram=2, title = 'Most common 2grams associated with neutral tweets')
plot_freq_dist(create_corpus(neutral_text_2), ngram=3, title = 'Most common 3grams associated with neutral tweets')
plot_freq_dist(create_corpus(neutral_text_2), ngram=4, title = 'Most common 4grams associated with neutral tweets')

[insight] : On voit une source de confusion arriver avec les tweets neutres dans la mesure où une proportion significative de ceux-ci se rapportent aussi à la fête des mères et star wars.

Analyse des mots clefs des tweets négatifs

plotWc(" ".join(negative_text_2), stopwords=stopwords.words('english'), title = "Wordcloud des tweets négatifs")
../_images/Projet_FF-Copy1_78_0.png
plot_freq_dist(create_corpus(negative_text_2), title = 'Most common words associated with negative tweets')
plot_freq_dist(create_corpus(negative_text_2), ngram=2, title = 'Most common 2grams associated with negative tweets')
plot_freq_dist(create_corpus(negative_text_2), ngram=3, title = 'Most common 3grams associated with negative tweets')
plot_freq_dist(create_corpus(negative_text_2), ngram=4, title = 'Most common 4grams associated with negative tweets')

[insight] : on observe l’utilisation de mots autocensurés (**) et de mots très chargés (hate) Il ne servira à rien de tester des n-gram de dimension 4 ou plus : le nombre d’occurences est trop faible

def list_words_with(text_series, search='', nb=30):
    '''
    List the words in a collection of strings that contain a given pattern.

    inputs :
        - text_series : an iterable of strings (e.g. a pd.Series)
        - search : the regex fragment to look for inside words
        - nb : return the nb most frequent matches

    output :
        - a list of (word, count) tuples, most frequent first
    '''

    # FIX: raw string (the original non-raw '\w' is an invalid escape
    # sequence, a DeprecationWarning on recent Pythons) and compiled once
    # instead of being recompiled for every tweet.
    # NOTE(review): the trailing space means a word is only matched when it
    # is followed by a space — matches at end-of-string are missed; kept
    # as-is to preserve the original behaviour.
    searchPattern = re.compile(rf"\w*{search}\w* ")

    cnt = Counter()

    for tweet in text_series:
        for word in searchPattern.findall(tweet):
            cnt[word] += 1
    return cnt.most_common(nb)
    
#liste des mots incluant auto-censure **
list_words_with(negative_text_2, search='\*{2}')
[('** ', 350),
 ('bl**y ', 1),
 ('_megan_** ', 1),
 ('**i ', 1),
 ('_da_** ', 1),
 ('j**i ', 1),
 ('_** ', 1),
 ('f** ', 1)]
#nombre d'utilisateurs
list_words_with(negative_text_2, search='USER')
[('USER ', 18)]
#nombre d'URLs
list_words_with(negative_text_2, search='URL')
[('URL ', 174)]
#liste des émojis
list_words_with(negative_text_2, search='EMO\w+')
[('EMO_Happy_face_smiley ', 17),
 ('EMO_Tongue_sticking_out_cheeky_playful_or_blowing_a_raspberry ', 16),
 ('EMO_Skeptical_annoyed_undecided_uneasy_or_hesitant ', 14),
 ('EMO_Frown_sad_andry_or_pouting ', 10),
 ('EMO_Surprise ', 9),
 ('EMO_Sealed_lips_or_wearing_braces_or_tongue ', 1),
 ('EMO_Embarrassed_or_blushing ', 1),
 ('EMO_Shock ', 1),
 ('EMO_Sad_or_Crying ', 1)]
#les mots qui incluents !!
list_words_with(negative_text_2, search='!!')
[('!! ', 51),
 ('me!! ', 13),
 ('sucks!! ', 7),
 ('you!! ', 6),
 ('it!! ', 6),
 ('today!! ', 5),
 ('again!! ', 4),
 ('now!! ', 4),
 ('hurts!! ', 4),
 ('up!! ', 4),
 ('starving!! ', 3),
 ('bed!! ', 3),
 ('time!! ', 3),
 ('ugh!! ', 3),
 ('that!! ', 3),
 ('sleep!! ', 3),
 ('home!! ', 3),
 ('noo!! ', 3),
 ('sorry!! ', 3),
 ('omg!! ', 3),
 ('blows!! ', 2),
 ('night!! ', 2),
 ('top!! ', 2),
 ('no!! ', 2),
 ('there!! ', 2),
 ('ughh!! ', 2),
 ('badly!! ', 2),
 ('house!! ', 2),
 ('guys!! ', 2),
 ('working!! ', 2)]
#les tweets complets qui incluent 'bs' (apparaît dans les 4grams)
list_words_with(negative_text_2, search='[\w ]* bs [\w ]*')
[('soo   kinda  sick  tired   bs  guys dish  ', 1),
 ('soo   kind  sick  tired   bs  guys dish  ', 1),
 ('fudge  bs   whole paper  tired ugh  hate school time  ', 1)]
#listing des mots clefs
list_words_with(negative_text_2, search='#[(\w*|\d*)]+')
[('#bgt ', 8),
 ('#fb ', 4),
 ('#itsucks ', 3),
 ('#fail ', 3),
 ('#fieldnotes ', 2),
 ('#sad ', 2),
 ('#beatwittyparty ', 2),
 ('#fixreplies ', 2),
 ('#liesboystell ', 2),
 ('#hunchback ', 1),
 ('#screengrab09 ', 1),
 ('#obs ', 1),
 ('#comet09 ', 1),
 ('#tcot ', 1),
 ('#antiboyle ', 1),
 ('#2923848932 ', 1),
 ('#linux ', 1),
 ('#ubuntu ', 1),
 ('#nontweetinggirlfriend ', 1),
 ('#brandwkshop ', 1),
 ('#followfriday ', 1),
 ('#beta1 ', 1),
 ('#livescribe ', 1),
 ('#luto ', 1),
 ('#rip ', 1),
 ('#rda2009cla ', 1),
 ('#britainsgottalent ', 1),
 ('#gm ', 1),
 ('#1 ', 1),
 ('#followfridays ', 1)]
def user_names(text_list):
    """Count @-mentions across a list of raw texts.

    inputs:
        text_list : an iterable of strings

    returns:
        a Counter mapping each whitespace-separated token that starts
        with '@' to its number of occurrences
    """
    mentions = Counter(
        token
        for text in text_list
        for token in text.split()
        if token.startswith('@')
    )
    return mentions
    
user_names(positive_text)
Counter({'@_agressiva23': 1,
         '@_TWEE': 1,
         '@': 44,
         '@_Mintyfresh': 1,
         '@_josh_thomas': 1,
         '@_catchfire': 1,
         '@_elj': 1,
         '@_Flik_': 1,
         '@_writersblock_': 1,
         '@_chloe': 1,
         '@_anh': 1,
         '@_mamalaura': 1,
         '@_careerbuilder_': 1,
         '@_Bella_Cullen13': 1,
         '@>--->--->---': 1})
user_names(positive_text_2)
Counter()
user_names(negative_text_2)
Counter()
user_names(neutral_text_2)
Counter()

Préalable : transformation des sorties

y_train = Preprocess_transform_target(y_train, columns_to_process=['sentiment'])
y_train.head()
sentiment
0 0
1 -1
2 -1
3 -1
4 -1
y_val = Preprocess_transform_target(y_val, ['sentiment'])
y_val.head()
NameErrorTraceback (most recent call last)
<ipython-input-21-d812305b1ab2> in <module>
----> 1 y_val = Preprocess_transform_target(y_val, ['sentiment'])
      2 y_val.head()

NameError: name 'y_val' is not defined
y_test = Preprocess_transform_target(y_test, ['sentiment'])
y_test.head()
sentiment
0 0
1 1
2 -1
3 1
4 1

Modélisation

Configuration de l’experiment MLFlow

On commence par définir une fonction générique qui sera en capacité d’ajuster, optimiser et logger dans MLFlow les résultats de pipelines qui seront produits pour chaque essai

La cellule suivante permet de créer des étapes de sélection de colonnes dans les Data Frame en entrée

Le mode de fonctionnement souhaité consiste à

from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one DataFrame column as a 1-D Series.

    Text vectorizers (TfidfVectorizer, CountVectorizer) expect a 1-D
    input, hence the single-bracket indexing in transform.
    """

    def __init__(self, field):
        # Name of the column to select.
        self.field = field

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        return X[self.field]

class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one DataFrame column as a 2-D frame.

    Numeric estimators and scalers expect a 2-D input, hence the
    double-bracket indexing in transform.
    """

    def __init__(self, field):
        # Name of the column to select.
        self.field = field

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        return X[[self.field]]
def score_estimator(
    estimator, X_train, X_test, df_train, df_test, target_col
):
    """
    Evaluate an estimator on the train and test sets.

    Computes macro-averaged F1, precision and recall on both subsets and
    returns a DataFrame with one row per metric and the columns
    ['train', 'test'], rounded to 4 decimals.
    """

    scorers = {
        "f1_macro": f1_score,
        "precision_macro": precision_score,
        "recall_macro": recall_score,
    }

    records = []
    for subset_label, X, df in (
        ("train", X_train, df_train),
        ("test", X_test, df_test),
    ):
        y_true = df[target_col]
        y_pred = estimator.predict(X)
        for score_label, scorer in scorers.items():
            records.append({
                "subset": subset_label,
                "metric": score_label,
                "score": scorer(y_true, y_pred, average='macro'),
            })

    table = (
        pd.DataFrame(records)
        .set_index(["metric", "subset"])
        .score.unstack(-1)
        .round(4)
        .loc[:, ['train', 'test']]
    )
    return table
def scores_to_dict(score_df):
    """
    Flatten a score DataFrame (metrics as index, 'train'/'test' columns)
    into a single flat dict: {'<metric>_train': ..., '<metric>_test': ...}.

    Useful for logging all scores at once (e.g. to MLFlow).
    """
    out = {}
    for subset in ('train', 'test'):
        # BUGFIX: train keys previously ended with a stray underscore
        # ('f1_macro_train_'), inconsistent with the '_test' suffix.
        for metric, value in score_df[subset].to_dict().items():
            out[f'{metric}_{subset}'] = value
    return out
# Create function so that we could reuse later
def plot_cm(y_test, y_pred, target_names=(-1, 0, 1), 
            figsize=(5, 3)):
    """Create a labelled confusion matrix plot.

    inputs:
        y_test : true labels
        y_pred : predicted labels
        target_names : axis tick labels, in confusion_matrix class order
        figsize : matplotlib figure size

    returns:
        nothing, draws the heatmap on a new figure
    """
    # FIX: tuple default instead of a mutable list default argument.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(cm, annot=True, fmt='g', cmap='BuGn', cbar=False, 
                ax=ax)
    ax.set_title('Confusion matrix')
    ax.set_xlabel('Predicted')
    ax.set_xticklabels(target_names)
    ax.set_ylabel('Actual')
    ax.set_yticklabels(target_names, 
                       fontdict={'verticalalignment': 'center'});

Train

def target_params(pipe, dict_keyval):
    """
    Build a dict assigning a value to every pipeline parameter whose name
    ends with one of the given keys.

    inputs:
        pipe : anything exposing get_params() (e.g. a sklearn Pipeline)
        dict_keyval : {param_suffix: value} mapping

    output:
        {full_param_name: value} for every '<step>__<suffix>' parameter
        of the pipe matching one of the given suffixes
    """

    res = {}
    # Hoisted out of the loop: the joined parameter-name string does not
    # depend on the key being searched.
    all_params = ' '.join(pipe.get_params().keys())
    for key, value in dict_keyval.items():
        # FIX: raw string — the original "[a-zA-Z\_]+__" used the invalid
        # escape '\_' in a non-raw string (DeprecationWarning on 3.12+).
        matches = re.findall(r"[a-zA-Z_]+__" + key, all_params)
        res.update(dict.fromkeys(matches, value))
    return res
def trainPipelineMlFlow(mlf_XP,
                        xp_name_iter,
                        pipeline,
                        X_train, y_train, X_test, y_test,
                        target_col='sentiment',
                        fixed_params=None,
                        use_opti=False, iterable_params=None, n_iter=20):
    """Train (and optionally optimise) a sklearn pipeline, logging to MLflow.

    Parameters
    ----------
    mlf_XP : str
        MLflow experiment name.
    xp_name_iter : str
        Run name within the experiment.
    pipeline : sklearn Pipeline
        Pipeline to fit; modified in place by ``set_params``.
    X_train, X_test : DataFrame
        Feature frames for each subset.
    y_train, y_test : DataFrame
        Target frames; labels are selected with *target_col*.
    target_col : str, default 'sentiment'
        Label column name inside *y_train* / *y_test*.
    fixed_params : dict, optional
        Pipeline parameters applied before fitting (e.g. random_state).
    use_opti : bool, default False
        If True, wrap the pipeline in a RandomizedSearchCV.
    iterable_params : dict, optional
        Parameter distributions for the randomized search.
    n_iter : int, default 20
        Number of candidates sampled by the randomized search.

    Returns
    -------
    The fitted pipeline, or the fitted RandomizedSearchCV when *use_opti*.
    """
    # Avoid mutable default arguments (shared across calls).
    fixed_params = {} if fixed_params is None else fixed_params
    iterable_params = {} if iterable_params is None else iterable_params

    mlflow.set_experiment(mlf_XP)

    with mlflow.start_run(run_name=xp_name_iter):

        start_time = time.monotonic()

        warnings.filterwarnings("ignore")

        # Fit pipeline (optionally wrapped in a randomized search).
        pipeline.set_params(**fixed_params)
        if not use_opti:
            search = pipeline
        else:
            search = RandomizedSearchCV(estimator=pipeline,
                                        param_distributions=iterable_params,
                                        n_jobs=-1,
                                        cv=5,
                                        scoring='f1_macro',
                                        n_iter=n_iter)

        search.fit(X_train, y_train[target_col])

        # Collect params to log; copy so the caller's dict is not mutated.
        params_to_log = dict(fixed_params)
        if use_opti:
            params_to_log.update(search.best_params_)  # optimal solution
        mlflow.log_params(params_to_log)

        # Evaluate metrics on both subsets.
        y_pred = search.predict(X_test)
        score = score_estimator(estimator=search,
                                X_train=X_train,
                                X_test=X_test,
                                df_train=y_train,
                                df_test=y_test,
                                target_col=target_col)

        # Print out metrics.  BUG FIX: the old code used
        # `"params: \n" % params_to_log`, which silently printed nothing
        # (the run outputs show an empty 'params:' section).
        print('XP :', xp_name_iter, '\n')
        print('pipeline : \n', score, '\n')
        print("params: \n", params_to_log, '\n')
        print("Confusion matrix: \n")
        plot_cm(y_test, y_pred)  # reuse the predictions computed above

        # Report to MLflow.  Log the fitted estimator (`search`): with
        # use_opti=True the bare `pipeline` object is not the fitted model.
        mlflow.log_metrics(scores_to_dict(score))
        mlflow.sklearn.log_model(search, xp_name_iter)

        end_time = time.monotonic()
        elapsed_time = timedelta(seconds=end_time - start_time)
        print('elapsed time :', elapsed_time)
        mlflow.set_tag(key="elapsed_time", value=elapsed_time)

    return search
        

Bag of Words avec Random Forest

# Bag-of-words pipeline: text column -> tf-idf features -> random forest.
bow_pipeline = Pipeline(steps=[
    ("coltext", TextSelector("text")),  # pick the corpus column to transform
    ("tfidf", TfidfVectorizer()),
    ("classifier", RandomForestClassifier(n_jobs=-1)),
])
list(bow_pipeline.get_params().keys())
['memory',
 'steps',
 'verbose',
 'coltext',
 'tfidf',
 'classifier',
 'coltext__field',
 'tfidf__analyzer',
 'tfidf__binary',
 'tfidf__decode_error',
 'tfidf__dtype',
 'tfidf__encoding',
 'tfidf__input',
 'tfidf__lowercase',
 'tfidf__max_df',
 'tfidf__max_features',
 'tfidf__min_df',
 'tfidf__ngram_range',
 'tfidf__norm',
 'tfidf__preprocessor',
 'tfidf__smooth_idf',
 'tfidf__stop_words',
 'tfidf__strip_accents',
 'tfidf__sublinear_tf',
 'tfidf__token_pattern',
 'tfidf__tokenizer',
 'tfidf__use_idf',
 'tfidf__vocabulary',
 'classifier__bootstrap',
 'classifier__ccp_alpha',
 'classifier__class_weight',
 'classifier__criterion',
 'classifier__max_depth',
 'classifier__max_features',
 'classifier__max_leaf_nodes',
 'classifier__max_samples',
 'classifier__min_impurity_decrease',
 'classifier__min_impurity_split',
 'classifier__min_samples_leaf',
 'classifier__min_samples_split',
 'classifier__min_weight_fraction_leaf',
 'classifier__n_estimators',
 'classifier__n_jobs',
 'classifier__oob_score',
 'classifier__random_state',
 'classifier__verbose',
 'classifier__warm_start']
# Baseline random-forest run (no hyper-parameter search).
trainPipelineMlFlow(mlf_XP="opti_F1",
                    xp_name_iter="test",
                    pipeline=bow_pipeline,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test,
                    target_col="sentiment",
                    fixed_params={"classifier__random_state": 42})
Bag Of Words - RF
params:
subset            train    test
metric                         
f1_macro         0.9991  0.6699
precision_macro  0.9991  0.6988
recall_macro     0.9991  0.6593
elapsed time : 0:00:02.763371
Pipeline(steps=[('coltext', TextSelector(field='text')),
                ('tfidf', TfidfVectorizer()),
                ('classifier',
                 RandomForestClassifier(n_jobs=-1, random_state=42))])
# Search space for the randomized hyper-parameter optimisation of the
# tf-idf + random-forest pipeline.
params = {
    "tfidf__use_idf": [True, False],
    "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "classifier__bootstrap": [True, False],
    "classifier__class_weight": ["balanced", None],
    "classifier__n_estimators": [100, 300, 500, 800, 1200],
    "classifier__max_depth": [5, 8, 15, 25, 30],
    "classifier__min_samples_split": [2, 5, 10, 15, 100],
    "classifier__min_samples_leaf": [1, 2, 5, 10],
}

trainPipelineMlFlow(mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - RF-Opti - n_iter_30",
                    pipeline=bow_pipeline,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test,
                    target_col="sentiment",
                    fixed_params={"classifier__random_state": 42},
                    use_opti=True,
                    iterable_params=params,
                    n_iter=30)
Bag Of Words - RF-Opti - n_iter_30
params:
subset            train    test
metric                         
f1_macro         0.7607  0.6938
precision_macro  0.7639  0.6958
recall_macro     0.7581  0.6921
elapsed time : 0:00:57.658394
RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('coltext',
                                              TextSelector(field='text')),
                                             ('tfidf', TfidfVectorizer()),
                                             ('classifier',
                                              RandomForestClassifier(n_jobs=-1,
                                                                     random_state=42))]),
                   n_iter=30, n_jobs=-1,
                   param_distributions={'classifier__bootstrap': [True, False],
                                        'classifier__class_weight': ['balanced',
                                                                     None],
                                        'classifier__max_depth': [5, 8, 15, 25,
                                                                  30],
                                        'classifier__min_samples_leaf': [1, 2,
                                                                         5,
                                                                         10],
                                        'classifier__min_samples_split': [2, 5,
                                                                          10,
                                                                          15,
                                                                          100],
                                        'classifier__n_estimators': [100, 300,
                                                                     500, 800,
                                                                     1200],
                                        'tfidf__ngram_range': [(1, 1), (1, 2),
                                                               (1, 3)],
                                        'tfidf__use_idf': [True, False]},
                   scoring='f1_macro')

Bag of Words avec régression logistique

# Bag-of-words pipeline: text column -> tf-idf features -> logistic regression.
bow_pipeline_LR = Pipeline(steps=[
    ("coltext", TextSelector("text")),  # pick the corpus column to transform
    ("tfidf", TfidfVectorizer()),
    ("classifier", LogisticRegression(solver="liblinear", multi_class="auto")),
])
list(bow_pipeline_LR.get_params().keys())
['memory',
 'steps',
 'verbose',
 'coltext',
 'tfidf',
 'classifier',
 'coltext__field',
 'tfidf__analyzer',
 'tfidf__binary',
 'tfidf__decode_error',
 'tfidf__dtype',
 'tfidf__encoding',
 'tfidf__input',
 'tfidf__lowercase',
 'tfidf__max_df',
 'tfidf__max_features',
 'tfidf__min_df',
 'tfidf__ngram_range',
 'tfidf__norm',
 'tfidf__preprocessor',
 'tfidf__smooth_idf',
 'tfidf__stop_words',
 'tfidf__strip_accents',
 'tfidf__sublinear_tf',
 'tfidf__token_pattern',
 'tfidf__tokenizer',
 'tfidf__use_idf',
 'tfidf__vocabulary',
 'classifier__C',
 'classifier__class_weight',
 'classifier__dual',
 'classifier__fit_intercept',
 'classifier__intercept_scaling',
 'classifier__l1_ratio',
 'classifier__max_iter',
 'classifier__multi_class',
 'classifier__n_jobs',
 'classifier__penalty',
 'classifier__random_state',
 'classifier__solver',
 'classifier__tol',
 'classifier__verbose',
 'classifier__warm_start']
# Only the tf-idf options are searched; the classifier keeps its defaults.
params = {
    "tfidf__use_idf": [True, False],
    "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

trainPipelineMlFlow(mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",
                    pipeline=bow_pipeline_LR,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test,
                    target_col="sentiment",
                    fixed_params={"classifier__random_state": 42},
                    use_opti=True,
                    iterable_params=params,
                    n_iter=30)
Bag Of Words - LR-Opti - n_iter_30
params:
subset            train    test
metric                         
f1_macro         0.7833  0.7102
precision_macro  0.8000  0.7254
recall_macro     0.7746  0.7020
elapsed time : 0:00:07.454859
RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('coltext',
                                              TextSelector(field='text')),
                                             ('tfidf', TfidfVectorizer()),
                                             ('classifier',
                                              LogisticRegression(n_jobs=-1,
                                                                 random_state=42,
                                                                 solver='liblinear'))]),
                   n_iter=30, n_jobs=-2,
                   param_distributions={'tfidf__ngram_range': [(1, 1), (1, 2),
                                                               (1, 3)],
                                        'tfidf__use_idf': [True, False]},
                   scoring='f1_macro')
../_images/Projet_FF-Copy1_121_2.png
pipe = bow_pipeline_LR

# Build the search space with target_params, which expands the bare names
# to every matching '<step>__<name>' parameter of the pipeline.
params = target_params(pipe, {
    "use_idf": [True, False],
    "ngram_range": [(1, 1), (1, 2), (1, 3), (1, 4)],
})

trainPipelineMlFlow(mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",
                    pipeline=pipe,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test,
                    target_col="sentiment",
                    fixed_params=target_params(pipe, {"n_jobs": -1,
                                                      "random_state": 42}),
                    use_opti=True,
                    iterable_params=params,
                    n_iter=30)
XP : Bag Of Words - LR-Opti - n_iter_30 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.7833  0.7102
precision_macro  0.8000  0.7254
recall_macro     0.7746  0.7020 

params: 
 

Confusion matrix: 

elapsed time : 0:00:15.012709
RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('coltext',
                                              TextSelector(field='text')),
                                             ('tfidf', TfidfVectorizer()),
                                             ('classifier',
                                              LogisticRegression(n_jobs=-1,
                                                                 random_state=42,
                                                                 solver='liblinear'))]),
                   n_iter=30, n_jobs=-1,
                   param_distributions={'tfidf__ngram_range': [(1, 1), (1, 2),
                                                               (1, 3), (1, 4)],
                                        'tfidf__use_idf': [True, False]},
                   scoring='f1_macro')
../_images/Projet_FF-Copy1_122_2.png
# NOTE(review): this cell crashed with BrokenProcessPool (traceback below).
# The worker processes spawned by RandomizedSearchCV(n_jobs=-1) failed to
# unpickle an NLTK 'WordListCorpusReader' held by the preprocessing step —
# presumably the stopword list captured inside preprocess_text; confirm
# before re-running with parallel workers.
pipe = bow_pipeline_LR_prepro


# Search over tf-idf idf weighting and n-gram range; target_params expands
# the bare names to fully-qualified pipeline parameter names.
params = target_params(pipe, {
    "use_idf": [True, False],
    "ngram_range": [(1, 1), (1, 2), (1,3), (1,4)]
})



trainPipelineMlFlow(
                    mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - LR-Opti - n_iter_30", 
                    pipeline = pipe, 
                    X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test, 
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
                    use_opti = True,
                    iterable_params = params,
                    n_iter = 30
                    )
_RemoteTracebackTraceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/joblib/externals/loky/process_executor.py", line 404, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
AttributeError: 'WordListCorpusReader' object has no attribute '_unload'
"""

The above exception was the direct cause of the following exception:

BrokenProcessPoolTraceback (most recent call last)
<ipython-input-68-ab0cf34ebaa9> in <module>
      9 
     10 
---> 11 trainPipelineMlFlow(
     12                     mlf_XP="DSA_Tweets",
     13                     xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",

<ipython-input-61-0ab31525d062> in trainPipelineMlFlow(mlf_XP, xp_name_iter, pipeline, X_train, y_train, X_test, y_test, target_col, fixed_params, use_opti, iterable_params, n_iter)
     31                                         n_iter = n_iter)
     32 
---> 33         search.fit(X_train, y_train[target_col])
     34 
     35         # get params

/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    839                 return results
    840 
--> 841             self._run_search(evaluate_candidates)
    842 
    843             # multimetric is determined here because in the case of a callable

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1617     def _run_search(self, evaluate_candidates):
   1618         """Search n_iter candidates from param_distributions"""
-> 1619         evaluate_candidates(ParameterSampler(
   1620             self.param_distributions, self.n_iter,
   1621             random_state=self.random_state))

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
    793                               n_splits, n_candidates, n_candidates * n_splits))
    794 
--> 795                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    796                                                        X, y,
    797                                                        train=train, test=test,

/usr/local/lib/python3.8/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1052 
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

/usr/local/lib/python3.8/dist-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

/usr/local/lib/python3.8/dist-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

/usr/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
    437                 raise CancelledError()
    438             elif self._state == FINISHED:
--> 439                 return self.__get_result()
    440             else:
    441                 raise TimeoutError()

/usr/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
    386     def __get_result(self):
    387         if self._exception:
--> 388             raise self._exception
    389         else:
    390             return self._result

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
# Baseline logistic-regression run (no hyper-parameter search).
trainPipelineMlFlow(mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - LR",
                    pipeline=bow_pipeline_LR,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test,
                    target_col="sentiment",
                    fixed_params={"classifier__random_state": 42})
Bag Of Words - LR
params:
subset            train    test
metric                         
f1_macro         0.7849  0.6956
precision_macro  0.8034  0.7147
recall_macro     0.7757  0.6864
elapsed time : 0:00:00.673433
Pipeline(steps=[('coltext', TextSelector(field='text')),
                ('tfidf', TfidfVectorizer()),
                ('classifier',
                 LogisticRegression(random_state=42, solver='liblinear'))])

TextPreprocessor

from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless sklearn transformer that applies ``preprocess_text``.

    Each ``apply_*`` flag toggles one cleaning step and defaults to True,
    so every flag becomes a tunable pipeline hyper-parameter.
    """

    # Flag names, in the order they are forwarded to preprocess_text.
    _FLAG_NAMES = (
        "apply_lemmatizer",
        "apply_lowercase",
        "apply_url_standerdisation",
        "apply_user_standerdisation",
        "apply_emoticon_to_words",
        "apply_stopwords_removal",
        "apply_shortwords_removal",
        "apply_non_alphabetical_removal",
        "apply_only_2_consecutive_charac",
    )

    def __init__(self,
                 apply_lemmatizer=True,
                 apply_lowercase=True,
                 apply_url_standerdisation=True,
                 apply_user_standerdisation=True,
                 apply_emoticon_to_words=True,
                 apply_stopwords_removal=True,
                 apply_shortwords_removal=True,
                 apply_non_alphabetical_removal=True,
                 apply_only_2_consecutive_charac=True):
        # sklearn convention: store each constructor argument unchanged so
        # get_params()/set_params() keep working.
        args = locals()
        for name in self._FLAG_NAMES:
            setattr(self, name, args[name])

    def fit(self, X, y=None):
        # Nothing to learn: the transformer is stateless.
        return self

    def transform(self, X):
        # Forward every flag to preprocess_text by keyword.
        flags = {name: getattr(self, name) for name in self._FLAG_NAMES}
        return preprocess_text(X, **flags)
# Same as bow_pipeline_LR but with an explicit text-cleaning stage
# inserted between column selection and vectorisation.
bow_pipeline_LR_prepro = Pipeline(steps=[
    ("coltext", TextSelector("text")),  # pick the corpus column to transform
    ("prepro", TextPreprocessor()),
    ("tfidf", TfidfVectorizer()),
    ("classifier", LogisticRegression(solver="liblinear", multi_class="auto")),
])
list(bow_pipeline_LR_prepro.get_params().keys())
['memory',
 'steps',
 'verbose',
 'coltext',
 'prepro',
 'tfidf',
 'classifier',
 'coltext__field',
 'prepro__apply_emoticon_to_words',
 'prepro__apply_lemmatizer',
 'prepro__apply_lowercase',
 'prepro__apply_non_alphabetical_removal',
 'prepro__apply_only_2_consecutive_charac',
 'prepro__apply_shortwords_removal',
 'prepro__apply_stopwords_removal',
 'prepro__apply_url_standerdisation',
 'prepro__apply_user_standerdisation',
 'tfidf__analyzer',
 'tfidf__binary',
 'tfidf__decode_error',
 'tfidf__dtype',
 'tfidf__encoding',
 'tfidf__input',
 'tfidf__lowercase',
 'tfidf__max_df',
 'tfidf__max_features',
 'tfidf__min_df',
 'tfidf__ngram_range',
 'tfidf__norm',
 'tfidf__preprocessor',
 'tfidf__smooth_idf',
 'tfidf__stop_words',
 'tfidf__strip_accents',
 'tfidf__sublinear_tf',
 'tfidf__token_pattern',
 'tfidf__tokenizer',
 'tfidf__use_idf',
 'tfidf__vocabulary',
 'classifier__C',
 'classifier__class_weight',
 'classifier__dual',
 'classifier__fit_intercept',
 'classifier__intercept_scaling',
 'classifier__l1_ratio',
 'classifier__max_iter',
 'classifier__multi_class',
 'classifier__n_jobs',
 'classifier__penalty',
 'classifier__random_state',
 'classifier__solver',
 'classifier__tol',
 'classifier__verbose',
 'classifier__warm_start']
# Baseline run of the pipeline with preprocessing.
trainPipelineMlFlow(
                    mlf_XP = "DSA_Tweets",
                    xp_name_iter = "Bag Of Words - LRprepro",
                    pipeline = bow_pipeline_LR_prepro,
                    X_train = X_train , y_train = y_train , X_test = X_test , y_test = y_test,
                    target_col = 'sentiment',
                    # Derive fixed params from the pipeline actually trained:
                    # the original used the stale `pipe` variable, which
                    # depended on notebook execution order.
                    fixed_params = target_params(bow_pipeline_LR_prepro,
                                                 {'n_jobs': -1, 'random_state': 42})
                    )
XP : Bag Of Words - LRprepro 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.7823  0.7030
precision_macro  0.7998  0.7192
recall_macro     0.7735  0.6945 

params: 
 

Confusion matrix: 

elapsed time : 0:02:15.036337
Pipeline(steps=[('coltext', TextSelector(field='text')),
                ('prepro', TextPreprocessor()), ('tfidf', TfidfVectorizer()),
                ('classifier',
                 LogisticRegression(n_jobs=-1, random_state=42,
                                    solver='liblinear'))])
../_images/Projet_FF-Copy1_129_2.png
target_params(bow_pipeline_LR_prepro, {'n_jobs': -1, 'random_state':42})
{'classifier__n_jobs': -1, 'classifier__random_state': 42}
pipe = bow_pipeline_LR_prepro

# Same run as above but with emoticon-to-word conversion switched off.
trainPipelineMlFlow(mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - LRprepro",
                    pipeline=pipe,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test,
                    target_col="sentiment",
                    fixed_params=target_params(pipe, {
                        "n_jobs": -1,
                        "random_state": 42,
                        "apply_emoticon_to_words": False,
                    }))
XP : Bag Of Words - LRprepro 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.7817  0.7029
precision_macro  0.7993  0.7196
recall_macro     0.7729  0.6943 

params: 
 

Confusion matrix: 

elapsed time : 0:01:01.560316
Pipeline(steps=[('coltext', TextSelector(field='text')),
                ('prepro', TextPreprocessor(apply_emoticon_to_words=False)),
                ('tfidf', TfidfVectorizer()),
                ('classifier',
                 LogisticRegression(n_jobs=-1, random_state=42,
                                    solver='liblinear'))])
../_images/Projet_FF-Copy1_131_2.png

Ici

params = target_params(pipe, {'apply_emoticon_to_words': [True, False], 'apply_lemmatizer': [True, False], 'apply_lowercase': [True, False], 'apply_non_alphabetical_removal': [True, False], 'apply_shortwords_removal': [True, False], 'apply_stopwords_removal': [True, False], 'apply_url_standerdisation': [True, False], 'apply_user_standerdisation': [True, False]})

pipe = bow_pipeline_LR

# Single tunable: whether tf-idf applies inverse-document-frequency
# weighting.
params = target_params(pipe, {"use_idf": [True, False]})


# NOTE(review): this cell crashed with BrokenProcessPool (traceback below)
# for the same reason as earlier: the NLTK 'WordListCorpusReader' used by
# the preprocessing step cannot be unpickled in the parallel workers that
# RandomizedSearchCV(n_jobs=-1) spawns — confirm before re-running.
pipe = bow_pipeline_LR_prepro

trainPipelineMlFlow(
                    mlf_XP = "DSA_Tweets",
                    xp_name_iter = "Bag Of Words - LRprepro - Opti", 
                    pipeline = pipe, 
                    X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs': -1, 'random_state':42}),
                    use_opti = True,
                    iterable_params = params
                    )
_RemoteTracebackTraceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/joblib/externals/loky/process_executor.py", line 404, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
AttributeError: 'WordListCorpusReader' object has no attribute '_unload'
"""

The above exception was the direct cause of the following exception:

BrokenProcessPoolTraceback (most recent call last)
<ipython-input-85-ad48903f2eef> in <module>
      9 pipe = bow_pipeline_LR_prepro
     10 
---> 11 trainPipelineMlFlow(
     12                     mlf_XP = "DSA_Tweets",
     13                     xp_name_iter = "Bag Of Words - LRprepro - Opti",

<ipython-input-61-0ab31525d062> in trainPipelineMlFlow(mlf_XP, xp_name_iter, pipeline, X_train, y_train, X_test, y_test, target_col, fixed_params, use_opti, iterable_params, n_iter)
     31                                         n_iter = n_iter)
     32 
---> 33         search.fit(X_train, y_train[target_col])
     34 
     35         # get params

/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    839                 return results
    840 
--> 841             self._run_search(evaluate_candidates)
    842 
    843             # multimetric is determined here because in the case of a callable

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1617     def _run_search(self, evaluate_candidates):
   1618         """Search n_iter candidates from param_distributions"""
-> 1619         evaluate_candidates(ParameterSampler(
   1620             self.param_distributions, self.n_iter,
   1621             random_state=self.random_state))

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
    793                               n_splits, n_candidates, n_candidates * n_splits))
    794 
--> 795                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    796                                                        X, y,
    797                                                        train=train, test=test,

/usr/local/lib/python3.8/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1052 
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

/usr/local/lib/python3.8/dist-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

/usr/local/lib/python3.8/dist-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

/usr/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
    437                 raise CancelledError()
    438             elif self._state == FINISHED:
--> 439                 return self.__get_result()
    440             else:
    441                 raise TimeoutError()

/usr/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
    386     def __get_result(self):
    387         if self._exception:
--> 388             raise self._exception
    389         else:
    390             return self._result

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
pipe = bow_pipeline_LR_prepro

# Toggle every preprocessing step on/off in the search space.
_prepro_switches = [
    'apply_emoticon_to_words',
    'apply_lemmatizer',
    'apply_lowercase',
    'apply_non_alphabetical_removal',
    'apply_shortwords_removal',
    'apply_stopwords_removal',
    'apply_url_standerdisation',
    'apply_user_standerdisation',
]
params = target_params(pipe, {name: [True, False] for name in _prepro_switches})

params
{'prepro__apply_emoticon_to_words': [True, False],
 'prepro__apply_lemmatizer': [True, False],
 'prepro__apply_lowercase': [True, False],
 'prepro__apply_non_alphabetical_removal': [True, False],
 'prepro__apply_shortwords_removal': [True, False],
 'prepro__apply_stopwords_removal': [True, False],
 'prepro__apply_url_standerdisation': [True, False],
 'prepro__apply_user_standerdisation': [True, False]}
# NOTE(review): this cell also crashed with BrokenProcessPool (traceback
# below) — same unpicklable NLTK 'WordListCorpusReader' in the loky
# workers spawned by RandomizedSearchCV(n_jobs=-1); confirm before
# re-running with parallel workers.
pipe = bow_pipeline_LR_prepro


params = target_params(pipe, {
    "use_idf": [True, False]
})



trainPipelineMlFlow(
                    mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - LR-Opti - n_iter_30", 
                    pipeline = pipe, 
                    X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
                    use_opti = True,
                    iterable_params = params,
                    n_iter = 30
                    )
_RemoteTracebackTraceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/joblib/externals/loky/process_executor.py", line 404, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
AttributeError: 'WordListCorpusReader' object has no attribute '_unload'
"""

The above exception was the direct cause of the following exception:

BrokenProcessPoolTraceback (most recent call last)
<ipython-input-89-9298ee19f84e> in <module>
      8 
      9 
---> 10 trainPipelineMlFlow(
     11                     mlf_XP="DSA_Tweets",
     12                     xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",

<ipython-input-61-0ab31525d062> in trainPipelineMlFlow(mlf_XP, xp_name_iter, pipeline, X_train, y_train, X_test, y_test, target_col, fixed_params, use_opti, iterable_params, n_iter)
     31                                         n_iter = n_iter)
     32 
---> 33         search.fit(X_train, y_train[target_col])
     34 
     35         # get params

/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    839                 return results
    840 
--> 841             self._run_search(evaluate_candidates)
    842 
    843             # multimetric is determined here because in the case of a callable

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1617     def _run_search(self, evaluate_candidates):
   1618         """Search n_iter candidates from param_distributions"""
-> 1619         evaluate_candidates(ParameterSampler(
   1620             self.param_distributions, self.n_iter,
   1621             random_state=self.random_state))

/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
    793                               n_splits, n_candidates, n_candidates * n_splits))
    794 
--> 795                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    796                                                        X, y,
    797                                                        train=train, test=test,

/usr/local/lib/python3.8/dist-packages/joblib/parallel.py in __call__(self, iterable)
   1052 
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

/usr/local/lib/python3.8/dist-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

/usr/local/lib/python3.8/dist-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

/usr/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
    437                 raise CancelledError()
    438             elif self._state == FINISHED:
--> 439                 return self.__get_result()
    440             else:
    441                 raise TimeoutError()

/usr/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
    386     def __get_result(self):
    387         if self._exception:
--> 388             raise self._exception
    389         else:
    390             return self._result

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
X_train_prepro = pd.DataFrame(preprocess_text(X_train['text']), columns=['text'])
X_train_prepro
text
0 responded going
1 soo sad miss san diego!!
2 bos bullying
3 interview! leave alone
4 son ** put release already bought
... ...
27475 wish could come see denver husband lost jo...
27476 wondered rake client made clear net ...
27477 yay good enjoy break probably need he...
27478 worth **
27479 flirting going atg smile yay ((hugs))

27480 rows × 1 columns

X_test_prepro = pd.DataFrame(preprocess_text(X_test['text']), columns=['text'])
X_test
text
0 Last session of the day http://twitpic.com/67ezh
1 Shanghai is also really exciting (precisely -...
2 Recession hit Veronique Branquinho, she has to...
3 happy bday!
4 http://twitpic.com/4w75p - I like it!!
... ...
3529 its at 3 am, im very tired but i can`t sleep ...
3530 All alone in this old house again. Thanks for...
3531 I know what you mean. My little dog is sinkin...
3532 _sutra what is your next youtube video gonna b...
3533 http://twitpic.com/4woj2 - omgssh ang cute n...

3534 rows × 1 columns

pipe = bow_pipeline_LR


params = target_params(pipe, {
    "use_idf": [True, False]
})



trainPipelineMlFlow(
                    mlf_XP="DSA_Tweets",
                    xp_name_iter="Bag Of Words - LR-prepro", 
                    pipeline = pipe, 
                    X_train = X_train_prepro, y_train = y_train, X_test = X_test_prepro, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
                    use_opti = True,
                    iterable_params = params,
                    n_iter = 30
                    )
XP : Bag Of Words - LR-prepro 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.7554  0.7031
precision_macro  0.7730  0.7201
recall_macro     0.7470  0.6943 

params: 
 

Confusion matrix: 

elapsed time : 0:00:02.456034
RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('coltext',
                                              TextSelector(field='text')),
                                             ('tfidf', TfidfVectorizer()),
                                             ('classifier',
                                              LogisticRegression(n_jobs=-1,
                                                                 random_state=42,
                                                                 solver='liblinear'))]),
                   n_iter=30, n_jobs=-1,
                   param_distributions={'tfidf__use_idf': [True, False]},
                   scoring='f1_macro')
../_images/Projet_FF-Copy1_141_2.png

PyTorch

import torch
torch.cuda.is_available()
True
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline


import numpy as np
from scipy.special import softmax
import csv
import urllib.request
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Normalise a tweet for the RoBERTa model.

    Replaces user mentions (tokens starting with '@', longer than one
    char) with the '@user' placeholder and URLs (tokens starting with
    'http') with the 'http' placeholder, as expected by the
    cardiffnlp/twitter-roberta-base models.
    """
    tokens = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            tokens.append('@user')
        elif token.startswith('http'):
            tokens.append('http')
        else:
            tokens.append(token)
    return " ".join(tokens)
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

model = AutoModelForSequenceClassification.from_pretrained('/mnt/pretrained_models/'+MODEL)
tokenizer = AutoTokenizer.from_pretrained('/mnt/pretrained_models/'+MODEL)
config = AutoConfig.from_pretrained('/mnt/pretrained_models/'+MODEL)
# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]
nlp=pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0, return_all_scores=True)
def TorchTwitterRoBERTa_Pred(text = "Good night 😊"):
    """Score *text* with the global Twitter-RoBERTa sentiment pipeline.

    The text is first normalised by preprocess() (user/URL placeholders),
    then passed through the HuggingFace `nlp` pipeline, which returns one
    score per label in the order negative, neutral, positive.

    Returns a (neg, neu, pos) tuple of probabilities.
    """
    scores = nlp(preprocess(text))[0]
    return scores[0]['score'], scores[1]['score'], scores[2]['score']
test = TorchTwitterRoBERTa_Pred()
test
(0.007609867490828037, 0.1458120346069336, 0.8465781211853027)
def run_loopy_roBERTa(df):
    """Score every row of *df* (first column = text) with
    TorchTwitterRoBERTa_Pred and return the results as a DataFrame with
    columns roBERTa_neg / roBERTa_neu / roBERTa_pos.

    Note: one model call per row — slow on large frames.
    """
    triples = [TorchTwitterRoBERTa_Pred(row.values[0])
               for _, row in df.iterrows()]
    return pd.DataFrame({'roBERTa_neg': [t[0] for t in triples],
                         'roBERTa_neu': [t[1] for t in triples],
                         'roBERTa_pos': [t[2] for t in triples]})
class clTwitterroBERTa(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer mapping a text column to the three
    Twitter-RoBERTa sentiment scores (via run_loopy_roBERTa)."""

    def __init__(self, field):
        # name of the column holding the raw tweet text
        self.field = field

    def fit(self, X, y=None):
        # stateless: the underlying model is pre-trained, nothing to learn
        return self

    def transform(self, X):
        # score only the selected text column
        return run_loopy_roBERTa(X[[self.field]])
roBERTa_pipe=Pipeline([
                     ('roBERTa', clTwitterroBERTa(field='text'))
                    ])
roBERTa_RF_Pipe = Pipeline(
    steps=[
        ('roBERTa', roBERTa_pipe),
        ("classifier", RandomForestClassifier(n_jobs=-1))
    ]
)
pipe = roBERTa_RF_Pipe


trainPipelineMlFlow(
                    mlf_XP="DSA_Tweets",
                    xp_name_iter="roBERTa - LR", 
                    pipeline = pipe, 
                    X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42})
                    )
XP : roBERTa - LR 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.9999  0.7150
precision_macro  0.9999  0.7154
recall_macro     0.9999  0.7149 

params: 
 

Confusion matrix: 

elapsed time : 0:06:42.314466
Pipeline(steps=[('roBERTa',
                 Pipeline(steps=[('roBERTa', clTwitterroBERTa(field='text'))])),
                ('classifier',
                 RandomForestClassifier(n_jobs=-1, random_state=42))])
../_images/Projet_FF-Copy1_155_2.png

Transformation des données par roBERTa

import gc

gc.collect()

torch.cuda.empty_cache()
import torch
torch.cuda.empty_cache()
X_train_roBERTa = roBERTa_pipe.transform(X_train)
X_train_roBERTa
roBERTa_neg roBERTa_neu roBERTa_pos
0 0.064939 0.808318 0.126744
1 0.918158 0.066100 0.015742
2 0.924613 0.070741 0.004646
3 0.783082 0.192980 0.023938
4 0.564197 0.404574 0.031229
... ... ... ...
27475 0.434403 0.445122 0.120474
27476 0.139542 0.635024 0.225433
27477 0.003337 0.022629 0.974034
27478 0.053331 0.357756 0.588913
27479 0.012305 0.150569 0.837125

27480 rows × 3 columns

X_test_roBERTa = roBERTa_pipe.transform(X_test)
X_train_roBERTa.to_parquet('/mnt/data/interim/X_train_roBERTa.gzip',compression='gzip')
X_test_roBERTa.to_parquet('/mnt/data/interim/X_test_roBERTa.gzip',compression='gzip')
roBERTa_RF = Pipeline(
    steps=[
        ("classifier", RandomForestClassifier(n_jobs=-1))
    ]
)
pipe = roBERTa_RF

params = target_params(pipe, {
    "bootstrap": [True, False],
    "class_weight": ["balanced", None],
    "n_estimators": [100, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
    "min_samples_split": [2, 5, 10, 15, 100],
    "min_samples_leaf": [1, 2, 5, 10]
})


roBERTa_RF_=trainPipelineMlFlow(
                    mlf_XP="DSA_Tweets",
                    xp_name_iter="roBERTa - RF - opti - 30", 
                    pipeline = pipe, 
                    X_train = X_train_roBERTa, y_train = y_train, X_test = X_test_roBERTa, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
                    use_opti = True,
                    iterable_params=params,
                    n_iter=30
                    )
XP : roBERTa - RF - opti - 30 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.7606  0.7444
precision_macro  0.7626  0.7451
recall_macro     0.7592  0.7446 

params: 
 

Confusion matrix: 

elapsed time : 0:02:09.753660
../_images/Projet_FF-Copy1_165_1.png

Essai combinaison de différentes méthodes

class Blob(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that extracts TextBlob sentiment
    (polarity, subjectivity) from a text column."""

    def __init__(self, field):
        # name of the column holding the raw text
        self.field = field

    def fit(self, X, y=None):
        # stateless transformer — nothing to learn
        return self

    def transform(self, X):
        # Build the result frame directly instead of writing new columns
        # into X: the original version mutated the caller's DataFrame in
        # place, silently adding 'polarity'/'subjectivity' to X_train/X_test.
        sentiments = X[self.field].apply(lambda x: TextBlob(x).sentiment).apply(pd.Series)
        sentiments.columns = ['polarity', 'subjectivity']
        return sentiments
blob_pipe=Pipeline([
                     ('blob', Blob(field='text'))
                    ])
X_train_Blob=blob_pipe.transform(X_train)
X_train_Blob.head()
polarity subjectivity
0 0.000000 0.0
1 -0.976562 1.0
2 0.000000 0.0
3 0.000000 0.0
4 0.000000 0.0
X_test_Blob=blob_pipe.transform(X_test)
X_test_Blob.head()
polarity subjectivity
0 0.0000 0.066667
1 0.1625 0.800000
2 0.0000 0.500000
3 1.0000 1.000000
4 0.0000 0.000000
X_train_Blob.to_parquet('/mnt/data/interim/X_train_Blob.gzip',compression='gzip')
X_test_Blob.to_parquet('/mnt/data/interim/X_test_Blob.gzip',compression='gzip')
class Vader(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer producing NLTK VADER sentiment
    scores (neg, neu, pos, compound) for a text column."""

    def __init__(self, field):
        # name of the column holding the raw text.
        # NOTE: the analyzer is deliberately NOT created here — the
        # original built one into a dead local variable (wasted work,
        # never used); creating it lazily in transform() also keeps the
        # transformer trivially picklable for joblib workers.
        self.field = field

    def fit(self, X, y=None):
        # stateless transformer — nothing to learn
        return self

    def transform(self, X):
        sid = SentimentIntensityAnalyzer()
        # Build the result frame directly instead of writing new columns
        # into X (the original mutated the caller's DataFrame in place).
        scores = X[self.field].apply(sid.polarity_scores).apply(pd.Series)
        return scores[['neg', 'neu', 'pos', 'compound']]
vader_pipe=Pipeline([
                     ('vader', Vader(field='text'))
                    ])
X_train_Vader=vader_pipe.transform(X_train)
X_train_Vader.head()
neg neu pos compound
0 0.000 1.000 0.0 0.0000
1 0.474 0.526 0.0 -0.7437
2 0.494 0.506 0.0 -0.5994
3 0.538 0.462 0.0 -0.3595
4 0.000 1.000 0.0 0.0000
X_test_Vader=vader_pipe.transform(X_test)
X_test_Vader.head()
neg neu pos compound
0 0.000 1.000 0.000 0.0000
1 0.000 0.670 0.330 0.7501
2 0.382 0.618 0.000 -0.7345
3 0.000 0.200 0.800 0.6114
4 0.000 0.393 0.607 0.4738
X_train_Vader.to_parquet('/mnt/data/interim/X_train_Vader.gzip',compression='gzip')
X_test_Vader.to_parquet('/mnt/data/interim/X_test_Vader.gzip',compression='gzip')
X_train_compound = pd.concat([X_train_roBERTa, X_train_Blob, X_train_Vader], axis=1)
X_test_compound = pd.concat([X_test_roBERTa, X_test_Blob, X_test_Vader], axis=1)
X_train_compound.head()
roBERTa_neg roBERTa_neu roBERTa_pos polarity subjectivity neg neu pos compound
0 0.064939 0.808318 0.126744 0.000000 0.0 0.000 1.000 0.0 0.0000
1 0.918158 0.066100 0.015742 -0.976562 1.0 0.474 0.526 0.0 -0.7437
2 0.924613 0.070741 0.004646 0.000000 0.0 0.494 0.506 0.0 -0.5994
3 0.783082 0.192980 0.023938 0.000000 0.0 0.538 0.462 0.0 -0.3595
4 0.564197 0.404574 0.031229 0.000000 0.0 0.000 1.000 0.0 0.0000
X_test_compound.head()
roBERTa_neg roBERTa_neu roBERTa_pos polarity subjectivity neg neu pos compound
0 0.034001 0.882237 0.083762 0.0000 0.066667 0.000 1.000 0.000 0.0000
1 0.001176 0.013178 0.985646 0.1625 0.800000 0.000 0.670 0.330 0.7501
2 0.908455 0.084444 0.007101 0.0000 0.500000 0.382 0.618 0.000 -0.7345
3 0.002410 0.013607 0.983983 1.0000 1.000000 0.000 0.200 0.800 0.6114
4 0.003367 0.030119 0.966514 0.0000 0.000000 0.000 0.393 0.607 0.4738
pipe = roBERTa_RF

params = target_params(pipe, {
    "bootstrap": [True, False],
    "class_weight": ["balanced", None],
    "n_estimators": [100, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
    "min_samples_split": [2, 5, 10, 15, 100],
    "min_samples_leaf": [1, 2, 5, 10]
})


roBERTa_RF_=trainPipelineMlFlow(
                    mlf_XP="DSA_Tweets",
                    xp_name_iter="roBERTa_Blob_Vader - RF - opti - 30", 
                    pipeline = pipe, 
                    X_train = X_train_compound, y_train = y_train, X_test = X_test_compound, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
                    use_opti = True,
                    iterable_params=params,
                    n_iter=30
                    )
XP : roBERTa_Blob_Vader - RF - opti - 30 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.8065  0.7564
precision_macro  0.8100  0.7591
recall_macro     0.8037  0.7544 

params: 
 

Confusion matrix: 

elapsed time : 0:03:21.937315
../_images/Projet_FF-Copy1_182_1.png
import xgboost as xgb
roBERTa_xgb = Pipeline(
    steps=[
        ("classifier", xgb.XGBClassifier())
    ]
)
pipe = roBERTa_xgb

params = target_params(pipe, {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     })


roBERTa_xgb_ = trainPipelineMlFlow(
                    mlf_XP="DSA_Tweets",
                    xp_name_iter="roBERTa - xgb - opti", 
                    pipeline = pipe, 
                    X_train = X_train_compound, y_train = y_train, X_test = X_test_compound, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
                    use_opti = True,
                    iterable_params=params,
                    n_iter=20
                    )
[17:18:59] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XP : roBERTa - xgb - opti 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.8116  0.7692
precision_macro  0.8137  0.7699
recall_macro     0.8099  0.7689 

params: 
 

Confusion matrix: 

elapsed time : 3:30:41.653581
../_images/Projet_FF-Copy1_185_1.png

Essai opti F1

pipe = bow_pipeline


essai_=trainPipelineMlFlow(
                    mlf_XP="opti F1",
                    xp_name_iter="test", 
                    pipeline = pipe, 
                    X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
                    target_col = 'sentiment',
                    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
                    use_opti = False
                    )
INFO: 'opti F1' does not exist. Creating a new experiment
XP : test 

pipeline : 
 subset            train    test
metric                         
f1_macro         0.9991  0.6890
precision_macro  0.9991  0.7190
recall_macro     0.9991  0.6779 

params: 
 

Confusion matrix: 

elapsed time : 0:00:04.605102
../_images/Projet_FF-Copy1_188_1.png
essai_.predict_proba(X_train)
array([[0.02, 0.96, 0.02],
       [0.87, 0.11, 0.02],
       [0.91, 0.08, 0.01],
       ...,
       [0.03, 0.06, 0.91],
       [0.05, 0.14, 0.81],
       [0.02, 0.84, 0.14]])
X_train.head()
text
0 I`d have responded, if I were going
1 Sooo SAD I will miss you here in San Diego!!!
2 my boss is bullying me...
3 what interview! leave me alone
4 Sons of ****, why couldn`t they put them on t...
for var in [-1, 0, 1]:
    plt.figure(figsize=(12,4))
    sns.distplot(essai_.predict_proba(X_train)[(y_train['sentiment']==var),0], bins=30, kde=False, 
                 color='green', label='Negative')
    sns.distplot(essai_.predict_proba(X_train)[(y_train['sentiment']==var),1], bins=30, kde=False, 
                 color='red', label='Neutral')
    sns.distplot(essai_.predict_proba(X_train)[(y_train['sentiment']==var),2], bins=30, kde=False, 
                 color='blue', label='Positive')
    plt.legend()
    plt.title(f'Histogram of {var} by true sentiment');
../_images/Projet_FF-Copy1_191_0.png ../_images/Projet_FF-Copy1_191_1.png ../_images/Projet_FF-Copy1_191_2.png

Stratégie : on maximise le seuil pour la décision positive, puis sur les non positifs, on maximise le seuil pour les négatifs, le reste est neutre

# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
    """Binarise probabilities: 1 where prob >= *threshold*, else 0."""
    return np.where(pos_probs >= threshold, 1, 0)
def find_optimal_f1_thresholds(pipe, X, y):
    """Grid-search per-class decision thresholds maximising binary F1.

    Independently scans thresholds in [0, 1) step 0.001 for the positive
    class (predict_proba column 2, truth y == 1) and the negative class
    (column 0, truth y == -1), one-vs-rest.

    Parameters
    ----------
    pipe : fitted estimator exposing predict_proba
    X    : features to score
    y    : true labels in {-1, 0, 1}

    Returns
    -------
    dict with keys 'pos_threshold', 'pos_f1', 'neg_threshold', 'neg_f1'.
    """
    probs = pipe.predict_proba(X)

    def _best_threshold(class_probs, binary_truth):
        # One one-vs-rest scan; the original duplicated this 12-line
        # block verbatim for the positive and negative classes.
        thresholds = np.arange(0, 1, 0.001)
        scores = [f1_score(binary_truth, to_labels(class_probs, t))
                  for t in thresholds]
        ix = np.argmax(scores)
        return thresholds[ix], scores[ix]

    pos_t, pos_f1 = _best_threshold(probs[:, 2], [(1 if i == 1 else 0) for i in y])
    neg_t, neg_f1 = _best_threshold(probs[:, 0], [(1 if i == -1 else 0) for i in y])

    return {'pos_threshold': pos_t, 'pos_f1': pos_f1,
            'neg_threshold': neg_t, 'neg_f1': neg_f1}
    
thres = find_optimal_f1_thresholds(roBERTa_RF_, X_train_compound, y_train['sentiment'])
thres
{'pos_threshold': 0.392,
 'pos_f1': 0.8337924701561066,
 'neg_threshold': 0.423,
 'neg_f1': 0.8066423449490502}
y_train['sentiment']
0        0
1       -1
2       -1
3       -1
4       -1
        ..
27475   -1
27476   -1
27477    1
27478    1
27479    0
Name: sentiment, Length: 27480, dtype: int64
roBERTa_RF_.predict_proba(X_train_compound)
array([[0.014984  , 0.9562709 , 0.02874509],
       [0.93819736, 0.05573562, 0.00606702],
       [0.9380295 , 0.05034958, 0.01162092],
       ...,
       [0.00301866, 0.08883453, 0.90814681],
       [0.10620177, 0.4432684 , 0.45052983],
       [0.01658131, 0.36426037, 0.61915832]])
def sentiment_predict(pipe, X, dict_thres):
    """Predict -1/0/1 sentiment labels from class probabilities.

    Decision rule (cascade): positive (1) when P(pos) >= pos_threshold;
    otherwise negative (-1) when P(neg) >= neg_threshold; anything left
    is neutral (0). Thresholds come from find_optimal_f1_thresholds.

    Parameters
    ----------
    pipe       : fitted estimator exposing predict_proba
    X          : features to score
    dict_thres : dict with 'pos_threshold' and 'neg_threshold'

    Returns
    -------
    array of labels in {-1, 0, 1}
    """
    seuil_pos = dict_thres['pos_threshold']
    seuil_neg = dict_thres['neg_threshold']

    probs = pipe.predict_proba(X)

    is_pos = to_labels(probs[:, 2], seuil_pos)
    is_neg = to_labels(probs[:, 0], seuil_neg)

    # Copy before mutating: the original wrote into is_pos's buffer
    # through an alias, corrupting the intermediate array.
    pred = is_pos.copy()
    not_pos = (is_pos == 0)
    pred[not_pos] = -is_neg[not_pos]
    return pred
y_test_pred = sentiment_predict(roBERTa_RF_, X_test_compound,thres)
f1_score(y_test, y_test_pred, average='macro')
0.7582696304640235
thres_xgb = find_optimal_f1_thresholds(roBERTa_xgb_, X_train_compound, y_train['sentiment'])
y_test_pred_xgb = sentiment_predict(roBERTa_xgb_, X_test_compound,thres_xgb)
f1_score(y_test, y_test_pred_xgb, average='macro')
0.7589563821278196

SHAP

import shap

shap.initjs()

sujets

import gensim.corpora as corpora# Create Dictionary
id2word = corpora.Dictionary(data_words)